from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader


def read_text_from_txt(files):
    """Read every file in *files* and return their combined UTF-8 text.

    Args:
        files: Iterable of binary file-like objects; each must provide a
            ``read()`` method returning ``bytes``.

    Returns:
        The decoded contents of all files concatenated in iteration order
        with no separator, or an empty string when *files* is empty.

    Raises:
        UnicodeDecodeError: If any file's content is not valid UTF-8.
    """
    # Join the text of all files so that earlier files are not discarded
    # when more than one file is supplied.
    return "".join(file.read().decode('utf-8') for file in files)


def extract_text_from_PDF(files):
    """Return the text of every page of every PDF in *files*, concatenated.

    Args:
        files: Iterable of PDF sources, each passed directly to
            ``PdfReader``.

    Returns:
        One string containing each page's ``extract_text()`` result, in
        file order and then page order, with no separator. Empty string
        when *files* is empty.
    """
    # Reference (official docs):
    # https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
    # Handles multiple PDF files in one call.
    page_texts = []
    for pdf in files:
        reader = PdfReader(pdf)
        page_texts.extend(page.extract_text() for page in reader.pages)
    return "".join(page_texts)


def split_content_into_chunks(text):
    """Split *text* into overlapping chunks with ``CharacterTextSplitter``.

    The splitter is configured with a newline separator, a chunk size of
    500, an overlap of 50, and ``len`` as the length function.

    Args:
        text: The string to split.

    Returns:
        The result of ``CharacterTextSplitter.split_text(text)``.
    """
    # Reference (official docs):
    # https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/character_text_splitter
    splitter_options = {
        "separator": "\n",
        "chunk_size": 500,
        "chunk_overlap": 50,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

def split_content_into_chunks_document(text):
    """Split content into chunked documents with ``CharacterTextSplitter``.

    The splitter uses a chunk size of 100 and an overlap of 50; the
    separator and length function are left at the library defaults.

    Args:
        text: Either a plain string, or an iterable of document objects
            accepted by ``CharacterTextSplitter.split_documents``.

    Returns:
        The documents produced by the splitter.
    """
    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=50)
    if isinstance(text, str):
        # split_documents expects document objects, not raw text, so a bare
        # string is wrapped into documents via create_documents instead.
        return text_splitter.create_documents([text])
    return text_splitter.split_documents(text)
